In [220]:
# Imports from __future__ in case we're running Python 2
from __future__ import division, print_function
from __future__ import absolute_import, unicode_literals

#pandas
import pandas as pd

# Our numerical workhorses
import numpy as np
import scipy.integrate

# Import pyplot for plotting
import matplotlib.pyplot as plt

# Seaborn, useful for graphics
import seaborn as sns

#pretty tables with plotly
import plotly
plotly.offline.init_notebook_mode() # run at the start of every notebook


# Import Bokeh modules for interactive plotting
import bokeh.io
import bokeh.mpl
import bokeh.plotting

# Magic function to make matplotlib inline; other style specs must come AFTER
%matplotlib inline

# This enables SVG graphics inline.  There is a bug, so uncomment if it works.
# %config InlineBackend.figure_formats = {'svg',}

# This enables high resolution PNGs. SVG is preferred, but has problems
# rendering vertical and horizontal lines
%config InlineBackend.figure_formats = {'png', 'retina'}

# JB's favorite Seaborn settings for notebooks
rc = {'lines.linewidth': 2, 
      'axes.labelsize': 18, 
      'axes.titlesize': 18, 
      'axes.facecolor': 'DFDFE5'}
sns.set_context('notebook', rc=rc)
sns.set_style('darkgrid', rc=rc)

# Set up Bokeh for inline viewing
bokeh.io.output_notebook()


BokehJS successfully loaded.

In [221]:
#load genotype data
fname= '../input/gandhi_et_al/130315_1A_genotypes.txt'
df_gt=pd.read_csv(fname, delimiter= '\t', comment= '#', header= [0,1])
#take a look
df_gt


Out[221]:
Genotype1 Genotype2 Genotype3
WT 17 Het 34 Mut 22
0 2 1 4
1 14 3 11
2 18 5 12
3 24 6 13
4 28 8 20
5 29 10 21
6 30 15 23
7 54 19 27
8 58 22 35
9 61 33 39
10 68 36 46
11 75 38 56
12 76 40 60
13 78 43 64
14 80 44 66
15 86 47 67
16 96 49 69
17 NaN 51 87
18 NaN 52 89
19 NaN 53 92
20 NaN 55 79
21 NaN 57 81
22 NaN 59 NaN
23 NaN 62 NaN
24 NaN 65 NaN
25 NaN 71 NaN
26 NaN 72 NaN
27 NaN 73 NaN
28 NaN 74 NaN
29 NaN 77 NaN
30 NaN 88 NaN
31 NaN 90 NaN
32 NaN 93 NaN
33 NaN 95 NaN

In [222]:
df_gt.columns


Out[222]:
MultiIndex(levels=[[u'Genotype1', u'Genotype2', u'Genotype3'], [u'Het 34', u'Mut 22', u'WT 17']],
           labels=[[0, 1, 2], [2, 0, 1]])

In [223]:
#remove multi indexing by reseting the columns to be the second level of the index
df_gt.columns= df_gt.columns.get_level_values(1)

#Check out the new columns
df_gt.columns


Out[223]:
Index([u'WT 17', u'Het 34', u'Mut 22'], dtype='object')

In [224]:
df_gt.columns= ['wt', 'het', 'mut'] #rename the columns

In [320]:
#this isn't tidy data.
#tidy data should have 2 columns: genotype and fish id
#let's use melt
df_gt= pd.melt(df_gt, var_name='genotype', value_name='fish')

df_gt.head()


Out[320]:
genotype fish
0 genotype wt
1 genotype wt
2 genotype wt
3 genotype wt
4 genotype wt

In [321]:
df_gt= df_gt.dropna()
df_gt.head()


Out[321]:
genotype fish
0 genotype wt
1 genotype wt
2 genotype wt
3 genotype wt
4 genotype wt

In [227]:
#load raw data
fname= '../input/gandhi_et_al/130315_01_rawDATA.txt'
df= pd.read_csv(fname, delimiter= '\t', comment= '#', header= [0,1 ])

#take a look
df.head()


Out[227]:
TIME(SECONDS) Unnamed: 1_level_0 FISH1 FISH2 FISH3 FISH4 FISH5 FISH6 FISH7 FISH8 ... FISH90 FISH91 FISH92 FISH93 FISH94 FISH95 FISH96 Unnamed: 98_level_0 Unnamed: 99_level_0 CLOCK
start end middur middur middur middur middur middur middur middur ... middur middur middur middur middur middur middur Unnamed: 98_level_1 Unnamed: 99_level_1 Unnamed: 100_level_1
0 0 60 0.6 1.4 0 0.0 0.0 0 0 1.7 ... 0.0 0 0.0 0 0.0 0 20.1 NaN NaN 9.519
1 60 120 1.9 0.0 0 0.0 0.0 0 0 0.0 ... 0.0 0 0.0 0 0.0 0 21.2 NaN NaN 9.536
2 120 180 1.9 0.0 0 1.5 0.1 0 0 0.0 ... 0.0 0 23.5 0 0.3 0 18.7 NaN NaN 9.553
3 180 240 13.4 0.0 0 4.5 0.0 0 0 0.0 ... 0.0 0 29.0 0 0.4 0 15.9 NaN NaN 9.569
4 240 300 15.4 0.0 0 0.0 0.0 0 0 0.0 ... 0.1 0 24.4 0 0.1 0 11.8 NaN NaN 9.586

5 rows × 101 columns


In [228]:
#make list of columns
#level 1 :2 -> start, end; level 0 2: -> FISH 1... CLOCK
df.columns= list(df.columns.get_level_values(1)[:2])\
                + list(df.columns.get_level_values(0)[2:])

In [229]:
df.columns


Out[229]:
Index([u'start', u'end', u'FISH1', u'FISH2', u'FISH3', u'FISH4', u'FISH5',
       u'FISH6', u'FISH7', u'FISH8',
       ...
       u'FISH90', u'FISH91', u'FISH92', u'FISH93', u'FISH94', u'FISH95',
       u'FISH96', u'Unnamed: 98_level_0', u'Unnamed: 99_level_0', u'CLOCK'],
      dtype='object', length=101)

In [230]:
cols_to_drop= df.columns[df.columns.str.contains('Unnamed')]
#drop the columns (axis 1 to drop cols, axis 0 for rows)
df= df.drop(cols_to_drop, axis= 1)

In [231]:
df= df.drop(['start', 'end'], axis= 1)

In [232]:
int('FISH42'.lstrip('FISH')) #how to separate number from FISH


Out[232]:
42

In [233]:
#Now loop through fish columns and drop ones that don't have genotypes
cols_to_drop= []
for col in df.columns:
    if 'FISH' in col and int(col.lstrip('FISH')) not in df_gt['fish'].values:
        cols_to_drop.append(col)

#drop em
df= df.drop(cols_to_drop, axis= 1)

In [234]:
#plot clock and t_start values
plt.plot(df.CLOCK, '.')
plt.xlabel('index')
plt.ylabel('time(hours)')


Out[234]:
<matplotlib.text.Text at 0x11ee621d0>

In [235]:
df['light']= pd.Series(df.CLOCK < 14.0, index= df.index)

In [236]:
dark_to_light= np.where(np.diff(df.light.astype(np.int)) == 1)[0]

#initialize array with day numbers
day= np.zeros_like(df.light, dtype= np.int)

for i in range(len(dark_to_light)-1):
    day[dark_to_light[i] +1: dark_to_light[i+1]+1]= i+1
day[dark_to_light[-1]+1:] = len(dark_to_light)
df['day']= pd.Series(day, index=df.index)

In [237]:
#build zeitgeiber clock
zeit= 24.0*df.day+df.CLOCK
df['zeit']= pd.Series(zeit, index= df.index)

In [238]:
genotypes= []

#check each column, put 'dummy' for non-fish column
for col in df.columns:
    if 'FISH' in col:
        fish_id= int(col.lstrip('FISH'))
#        print(fish_id, df_gt.genotype[df_gt.fish== fish_id].iloc[0])
        genotypes.append(df_gt.genotype[df_gt.fish== fish_id].iloc[0])
    else:
        genotypes.append(None)

#make a multi-index dataframe
df.columns= pd.MultiIndex.from_arrays((genotypes, df.columns),\
                                      names= ['genotype', 'variable'])

Tidy up the data first method


In [239]:
df.head()


Out[239]:
genotype het wt het mut het mut het mut het wt NaN
variable FISH1 FISH2 FISH3 FISH4 FISH5 FISH6 FISH8 FISH10 FISH11 FISH12 ... FISH89 FISH90 FISH92 FISH93 FISH95 FISH96 CLOCK light day zeit
0 0.6 1.4 0 0.0 0.0 0 1.7 0 0 0 ... 0 0.0 0.0 0 0 20.1 9.519 True 0 9.519
1 1.9 0.0 0 0.0 0.0 0 0.0 0 0 0 ... 0 0.0 0.0 0 0 21.2 9.536 True 0 9.536
2 1.9 0.0 0 1.5 0.1 0 0.0 0 0 0 ... 0 0.0 23.5 0 0 18.7 9.553 True 0 9.553
3 13.4 0.0 0 4.5 0.0 0 0.0 0 0 0 ... 0 0.0 29.0 0 0 15.9 9.569 True 0 9.569
4 15.4 0.0 0 0.0 0.0 0 0.0 0 0 0 ... 0 0.1 24.4 0 0 11.8 9.586 True 0 9.586

5 rows × 77 columns


In [240]:
#melt the dataframe 
#value_vars get turned into data
#id_vars remain as columns

#value variables are the ones with FISH in the name
col_1= df.columns.get_level_values(1)
value_vars= list(df.columns[col_1.str.contains('FISH')])

#id_vars are non fish entries
id_vars=  list(df.columns[~col_1.str.contains('FISH')])

#perform the melt
df1= pd.melt(df, value_vars= value_vars, id_vars= id_vars, value_name=\
            'activity', var_name= ['genotype', 'fish'])

In [241]:
df1.head()


Out[241]:
(nan, CLOCK) (nan, light) (nan, day) (nan, zeit) genotype fish activity
0 9.519 True 0 9.519 het FISH1 0.6
1 9.536 True 0 9.536 het FISH1 1.9
2 9.553 True 0 9.553 het FISH1 1.9
3 9.569 True 0 9.569 het FISH1 13.4
4 9.586 True 0 9.586 het FISH1 15.4

In [242]:
#rename tuple columns:
for i, col in enumerate(df1.columns):
    if type(col) is tuple:
        df1.rename(columns={col: col[1]}, inplace= True)

In [243]:
df1= df1[['fish', 'genotype', 'day', 'light', 'CLOCK', 'zeit', 'activity']]

In [244]:
df1.head()


Out[244]:
fish genotype day light CLOCK zeit activity
0 FISH1 het 0 True 9.519 9.519 0.6
1 FISH1 het 0 True 9.536 9.536 1.9
2 FISH1 het 0 True 9.553 9.553 1.9
3 FISH1 het 0 True 9.569 9.569 13.4
4 FISH1 het 0 True 9.586 9.586 15.4

Tidy up the data second method


In [245]:
df2= df.copy()
df2.head()


Out[245]:
genotype het wt het mut het mut het mut het wt NaN
variable FISH1 FISH2 FISH3 FISH4 FISH5 FISH6 FISH8 FISH10 FISH11 FISH12 ... FISH89 FISH90 FISH92 FISH93 FISH95 FISH96 CLOCK light day zeit
0 0.6 1.4 0 0.0 0.0 0 1.7 0 0 0 ... 0 0.0 0.0 0 0 20.1 9.519 True 0 9.519
1 1.9 0.0 0 0.0 0.0 0 0.0 0 0 0 ... 0 0.0 0.0 0 0 21.2 9.536 True 0 9.536
2 1.9 0.0 0 1.5 0.1 0 0.0 0 0 0 ... 0 0.0 23.5 0 0 18.7 9.553 True 0 9.553
3 13.4 0.0 0 4.5 0.0 0 0.0 0 0 0 ... 0 0.0 29.0 0 0 15.9 9.569 True 0 9.569
4 15.4 0.0 0 0.0 0.0 0 0.0 0 0 0 ... 0 0.1 24.4 0 0 11.8 9.586 True 0 9.586

5 rows × 77 columns


In [246]:
#zero level columns
# 'variable is the 2nd row of headings
cols= [col.replace('FISH', 'activity_')\
          for col in df2.columns.get_level_values('variable')]

df2.columns= pd.MultiIndex.from_arrays([df2.columns.get_level_values('genotype'),\
                                      cols], names= ['genotype', 'variable'])

In [247]:
df2.head()


Out[247]:
genotype het wt het mut het mut het mut het wt NaN
variable activity_1 activity_2 activity_3 activity_4 activity_5 activity_6 activity_8 activity_10 activity_11 activity_12 ... activity_89 activity_90 activity_92 activity_93 activity_95 activity_96 CLOCK light day zeit
0 0.6 1.4 0 0.0 0.0 0 1.7 0 0 0 ... 0 0.0 0.0 0 0 20.1 9.519 True 0 9.519
1 1.9 0.0 0 0.0 0.0 0 0.0 0 0 0 ... 0 0.0 0.0 0 0 21.2 9.536 True 0 9.536
2 1.9 0.0 0 1.5 0.1 0 0.0 0 0 0 ... 0 0.0 23.5 0 0 18.7 9.553 True 0 9.553
3 13.4 0.0 0 4.5 0.0 0 0.0 0 0 0 ... 0 0.0 29.0 0 0 15.9 9.569 True 0 9.569
4 15.4 0.0 0 0.0 0.0 0 0.0 0 0 0 ... 0 0.1 24.4 0 0 11.8 9.586 True 0 9.586

5 rows × 77 columns


In [248]:
#slice using cross sections
df2.xs('zeit', level= 'variable', axis= 1).head()


Out[248]:
genotype nan
0 9.519
1 9.536
2 9.553
3 9.569
4 9.586

In [249]:
df2['wt'].head()


Out[249]:
variable activity_2 activity_14 activity_18 activity_24 activity_28 activity_29 activity_30 activity_54 activity_58 activity_61 activity_68 activity_75 activity_76 activity_78 activity_80 activity_86 activity_96
0 1.4 0 0 0.0 0.8 1.9 0.0 0 0 0 0 0.3 0.0 0.0 0.1 4.1 20.1
1 0.0 0 0 0.0 0.0 0.0 0.1 0 0 0 0 0.1 0.0 0.0 0.0 0.0 21.2
2 0.0 0 0 0.0 0.0 0.0 0.0 0 0 0 0 1.2 0.1 0.0 0.1 0.0 18.7
3 0.0 0 0 0.1 0.0 0.0 0.1 0 0 0 0 0.0 0.0 0.0 0.0 0.0 15.9
4 0.0 0 0 0.0 0.0 0.0 0.0 0 0 0 0 0.6 0.0 0.4 0.0 0.0 11.8

The above dataframe is now tidied up, and you can use cross-sections to query is ( df.xs() ), but i prefer the first method

Preliminary exploration of data


In [250]:
# Get view into DataFrame with only fish 2
df_fish = df1[df1['fish'] == 'FISH2']

# Get figure and axis objects using plt.subplots utility function
fig, ax = plt.subplots()

# Plot activity versus time for fish 2.  lw denotes the line width
ax.plot(df_fish.zeit, df_fish.activity, '-', lw=0.25)

# Set axis labels
ax.set_xlabel('time (hr)')
ax.set_ylabel('activity (sec / min)')

# Set axis limits
ax.set_xlim((df_fish.zeit.min(), df_fish.zeit.max()))
ax.set_ylim((0, ax.get_ylim()[1]))

# We can overlay day and night.  We'll make night shaded.
# The where keyword is useful, and ~df_wt.light means "not light."
ax.fill_between(df_fish.zeit, 0, ax.get_ylim()[1], where=~df_fish.light, 
                color='gray',  alpha=0.3)

print('Fish 2 is', df_fish.genotype.iloc[0])


Fish 2 is wt

In [316]:
def pretty_activity_plot(ax, selector, selection, col, df, xlabel='time (hr)', 
                         ylabel='activity (sec / min)', lw=0.25, 
                         color=None):
    """
    Makes a pretty plot of sleep traces.  Generates the plot on axes ax,
    and then returns the updated ax.
    """
    # Make sure selection input is iterable
    if type(selection) in [str, int, float, unicode]:
        selection = [selection]
    
    # Plot time traces of column col for each fish
    for sel in selection:
        # Pull out record of interest
        df_plot = df[df[selector]==sel]
        # Generate plot
        if color is None:
            ax.plot(df_plot.zeit, df_plot[col], '-', lw=lw)
        else:
            ax.plot(df_plot.zeit, df_plot[col], '-', lw=lw, color=color)

    # Label axes
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    
    # Set axis limits
    ax.set_xlim((df_plot.zeit.min(), df_plot.zeit.max()))
    ax.set_ylim((0.0, ax.get_ylim()[1]))
    
    # Overlay night shading
    ax.fill_between(df_plot.zeit, 0.0, ax.get_ylim()[1], 
                    where=~df_plot.light, color='gray', alpha=0.3, zorder=0)    
    
    return ax

In [252]:
# Perform the rolling sum and return it as a new DataFrame
time_win = 10  # in units of no. of indices to sum over, equal to min here

# Perform rolling sum and store in window column
df1['window']= df1.groupby('fish')['activity'].apply(pd.rolling_sum, time_win)

In [304]:
fig, ax = plt.subplots()
ax = pretty_activity_plot(ax, 'fish', str('FISH2'), 'window', df1, lw=1,
                          ylabel='activity (sec/10 min)')



In [307]:
# Time points/lightness
t = df1.zeit[df1.fish=='FISH1']
light = df1.light[df1.fish=='FISH1']

# Index for right edge of first averaging win. (ensure win. ends at lights out)
start_ind = time_win + np.where(np.diff(light.astype(int)))[0][0] % time_win

# The time points we want
times = np.sort(np.unique(df1.zeit))[start_ind::time_win]

# The indicies of the tidy DataFrame we want
inds = df1.zeit.isin(times)

# New DataFrame
new_cols = ['fish', 'genotype', 'day', 'light', 'CLOCK', 'zeit', 'window']
df1_10 = df1[new_cols][inds]

# Rename window column to activity
df1_10 = df1_10.rename(columns={'window': 'activity'})

In [318]:
# Plot the results for non-overlapping ten minute intervals
fig, ax = plt.subplots()
ax = pretty_activity_plot(ax, 'fish', 'FISH2', 'activity', df1_10, lw=1, 
                          ylabel='activity (sec/10 min)')



In [319]:
# Save our new DataFrame to CSV file, using kwarg index=False to suppress
# writing of (trivial) indices
df1.to_csv('../input/130315_1_minute_intervals.csv', index=False)
df1_10.to_csv('../input/130315_10_minute_intervals.csv', index=False)